Time Until Large Events#

import pandas as pd
import numpy as np
import datetime as dt

# ETAS simulation catalog.
csv_file = "../datasets/Formatted_ETAS_Output.csv"
etas = pd.read_csv(csv_file, sep=',', lineterminator='\n')

# Observed USGS catalog (1960-2023). low_memory=False makes pandas read the
# file in a single pass and infer one dtype per column, silencing the
# mixed-type DtypeWarning this file previously emitted.
csv_file = "../datasets/All (1960-2023).csv"
usgs = pd.read_csv(csv_file, sep=',', lineterminator='\n',
                   dtype={'time': str}, low_memory=False)
C:\Users\Vishal\AppData\Local\Temp\ipykernel_17496\2239509985.py:4: DtypeWarning: Columns (1,2,3,4,6,7,8,9,15,16,17,18) have mixed types. Specify dtype option on import or set low_memory=False.
  usgs = pd.read_csv(csv_file, sep = ',', lineterminator='\n', dtype={'time':str})

Data Filtering#

  1. Converting the date columns to datetime

  2. Date > 1960-01-01 and < 2023-01-01

  3. Longitude > -123 and < -113

  4. Latitude > 29 and < 39

# --- ETAS dates ---
# %y maps two-digit years with a pivot, so parsed dates that land in the
# future are rolled back one century (e.g. 2061 -> 1961).
etas["Date"] = pd.to_datetime(etas["Date"], errors="coerce", format="%m/%d/%y")
etas.loc[etas["Date"].dt.year > pd.Timestamp.now().year, "Date"] -= pd.DateOffset(years=100)

# --- USGS dates ---
# Keep Date as a datetime normalized to midnight. This matches the original
# datetime -> "%Y-%m-%d" string -> datetime round trip (which discarded the
# time of day) while avoiding repeated re-parsing downstream.
usgs["Date"] = pd.to_datetime(usgs["time"], errors="coerce").dt.normalize()
usgs.drop(columns=["time"], inplace=True)

# --- Time window: 1960-01-01 < Date < 2023-01-01 (exclusive) ---
start, end = pd.Timestamp("1960-01-01"), pd.Timestamp("2023-01-01")
etas = etas[(etas["Date"] > start) & (etas["Date"] < end)]
usgs = usgs[(usgs["Date"] > start) & (usgs["Date"] < end)]

# Mixed-type CSV columns may contain stray strings; coerce them to NaN.
usgs["longitude"] = pd.to_numeric(usgs["longitude"], errors="coerce")
usgs["latitude"] = pd.to_numeric(usgs["latitude"], errors="coerce")
usgs["mag"] = pd.to_numeric(usgs["mag"], errors="coerce")

# --- Spatial window: -123 < lon < -113 and 29 < lat < 39 ---
# Single boolean mask per frame instead of four separate filter passes.
etas = etas[etas["X"].between(-123, -113, inclusive="neither")
            & etas["Y"].between(29, 39, inclusive="neither")]
usgs = usgs[usgs["longitude"].between(-123, -113, inclusive="neither")
            & usgs["latitude"].between(29, 39, inclusive="neither")]

# Daily maximum magnitude for the ETAS catalog.
max_mag_etas = etas.groupby(etas["Date"].dt.to_period("D")).Magnitude.max().reset_index()

Data Grouping And Merging#

The data is grouped into one-day chunks, keeping the maximum magnitude recorded on each day.

# Vectorized parse replaces the original element-by-element Python loop;
# values that are already Timestamps pass through unchanged.
usgs['Date'] = pd.to_datetime(usgs['Date'])
# Daily maximum magnitude for the USGS catalog.
max_mag_usgs = usgs.groupby(usgs['Date'].dt.to_period('D')).mag.max().reset_index()
# Magnitude threshold above which a day counts as a "large event".
large_earthquake = 6

Large Events#

A binary label marks each day whose maximum magnitude exceeds the large-earthquake threshold as a Large Event.

# Label days whose max ETAS magnitude exceeds the threshold, then compute the
# gap in days between consecutive large-event days; all other rows stay NaN.
large_mag_etas = max_mag_etas.copy()
large_mag_etas["Large Event"] = (large_mag_etas["Magnitude"] > large_earthquake).astype(int)
large_mag_etas["Date"] = large_mag_etas["Date"].dt.to_timestamp()
large_mag_etas['time_diff'] = (
    large_mag_etas.loc[large_mag_etas['Large Event'] == 1, 'Date'].diff().dt.days
)
# Use .loc with an explicit label instead of the chained
# `['time_diff'].iloc[0] = ...`, which raised SettingWithCopyWarning and is
# not guaranteed to write through to the DataFrame.
large_mag_etas.loc[large_mag_etas.index[0], 'time_diff'] = pd.NA
large_mag_etas.head()
# large_mag_etas.to_csv("large_mag_etas.csv")
C:\Users\Vishal\AppData\Local\Temp\ipykernel_17496\1056532160.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  large_mag_etas['time_diff'].iloc[0] = pd.NA
Date Magnitude Large Event time_diff
0 1960-01-02 4.25 0 NaN
1 1960-01-03 3.90 0 NaN
2 1960-01-04 4.24 0 NaN
3 1960-01-05 3.40 0 NaN
4 1960-01-06 3.47 0 NaN
import plotly.express as px
import plotly.graph_objects as go
# Bar chart of the day gaps (time_diff) between consecutive large ETAS
# events; non-large-event rows are NaN and render as missing bars.
fig = go.Figure(data=[go.Bar(
    x=large_mag_etas['Date'],
    y=large_mag_etas['time_diff'],
)])
# Customize the bar appearance
fig.update_traces(marker_line_color='black', marker_line_width=1)  # black bar outline, width 1 (no fill color is set here)
# Customize the plot layout
fig.update_layout(
    title='Time Difference Bar Chart (ETAS)',
    xaxis_title='Date',
    yaxis_title='Time Difference (Days)',
)

# Show the plot
fig.show()
# Same labeling as the ETAS catalog, applied to the USGS daily maxima:
# mark large-event days and compute gaps (in days) between them.
large_mag_usgs = max_mag_usgs.copy()
large_mag_usgs["Large Event"] = (large_mag_usgs["mag"] > large_earthquake).astype(int)
large_mag_usgs["Date"] = large_mag_usgs["Date"].dt.to_timestamp()
large_mag_usgs['time_diff'] = (
    large_mag_usgs.loc[large_mag_usgs['Large Event'] == 1, 'Date'].diff().dt.days
)
# .loc with an explicit label avoids the chained-assignment
# SettingWithCopyWarning raised by `['time_diff'].iloc[0] = ...`.
large_mag_usgs.loc[large_mag_usgs.index[0], 'time_diff'] = pd.NA
large_mag_usgs.head()
C:\Users\Vishal\AppData\Local\Temp\ipykernel_17496\3186837181.py:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Date mag Large Event time_diff
0 1960-01-02 4.04 0 NaN
1 1960-01-05 3.03 0 NaN
2 1960-01-07 3.64 0 NaN
3 1960-01-08 3.10 0 NaN
4 1960-01-11 3.79 0 NaN
# Bar chart of the day gaps (time_diff) between consecutive large USGS
# events, matching the ETAS chart above but with red bars.
fig = go.Figure(data=[go.Bar(
    x=large_mag_usgs['Date'],
    y=large_mag_usgs['time_diff'],
)])
# Customize the bar appearance
fig.update_traces(marker_color='red', marker_line_color='black', marker_line_width=1)  # red bars with a black outline of width 1

# Customize the plot layout
fig.update_layout(
    title='Time Difference Bar Chart (USGS)',
    xaxis_title='Date',
    yaxis_title='Time Difference (Days)',
)

# Show the plot
fig.show()
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[15], line 1
----> 1 import tensorflow as tf
      2 from sklearn.model_selection import train_test_split
      3 from sklearn.preprocessing import StandardScaler

ModuleNotFoundError: No module named 'tensorflow'
sequence_length = 10  # window length; adjust to the data and task

def create_sequences(data, sequence_length):
    """Return an array of overlapping length-`sequence_length` windows of `data`.

    A step-1 window slides over `data`; with n rows this yields
    n - sequence_length windows (the final rows never start a window),
    stacked into a single numpy array.
    """
    n_windows = len(data) - sequence_length
    return np.array([data[start:start + sequence_length] for start in range(n_windows)])
# Extract the relevant columns and scale the data
# Keep only confirmed large-event days with a defined inter-event gap
# (the first large event has no predecessor, so its time_diff is NaN).
large_events_etas =  large_mag_etas[(large_mag_etas['Large Event'] == 1) & ~large_mag_etas['time_diff'].isna()]
data = large_events_etas[['Large Event', 'time_diff']]
# NOTE(review): 'Large Event' is constant (== 1) on these filtered rows, so
# after standardization that column is all zeros (see the printed output
# below) and carries no signal -- consider dropping it.
scaler = StandardScaler()
data = scaler.fit_transform(data)
print(data)
[[ 0.         -0.93676124]
 [ 0.          0.54943116]
 [ 0.         -0.53469953]
 [ 0.          0.36515287]
 [ 0.         -1.01813088]
 [ 0.         -0.6687201 ]
 [ 0.          2.42811237]
 [ 0.          0.78636038]
 [ 0.          0.11625753]
 [ 0.          0.78636038]
 [ 0.          1.43492421]
 [ 0.         -0.95112059]
 [ 0.         -1.09232083]
 [ 0.         -0.83145937]
 [ 0.         -1.03727667]
 [ 0.         -0.30255676]
 [ 0.          2.34434952]
 [ 0.         -0.34324157]
 [ 0.          0.60686854]
 [ 0.          1.57373123]
 [ 0.         -0.57777757]
 [ 0.          0.53267859]
 [ 0.         -0.42939765]
 [ 0.         -0.43179088]
 [ 0.         -0.65675398]
 [ 0.         -0.55623855]
 [ 0.         -1.04206312]
 [ 0.          0.35079353]
 [ 0.         -0.91043577]
 [ 0.          0.79114683]
 [ 0.         -0.40307218]
 [ 0.         -0.19007521]
 [ 0.         -0.43657733]
 [ 0.          0.76242814]
 [ 0.          3.01923882]
 [ 0.         -0.20443455]
 [ 0.         -0.65675398]
 [ 0.         -0.52751986]
 [ 0.         -0.84342549]
 [ 0.         -0.80034745]
 [ 0.          1.53543964]
 [ 0.          0.04924724]
 [ 0.         -0.1015259 ]
 [ 0.         -1.04445634]
 [ 0.         -0.50358761]]
# Create sequences
sequences = create_sequences(data, sequence_length)

# Split the data into training and testing sets
X = sequences[:, :, :-1]  # Input features (all columns except the last one)
y = sequences[:, -1, -1]  # Target variable (time_diff for the last time step)
# NOTE(review): since `data` holds only ['Large Event', 'time_diff'],
# dropping the last column leaves X with just the scaled 'Large Event'
# column, which is constant -- the model has no real signal, consistent
# with the identical predictions printed further down. Feeding lagged
# time_diff values as features would give it something to learn from.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(X.shape[1], X.shape[2])),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Make predictions
predicted_time_until_next_large_event = model.predict(X_test)

# Inverse transform the predictions to get them in the original scale
# (stack the last-step features with the prediction so the column layout
# matches what the scaler was fitted on).
predicted_time_until_next_large_event = scaler.inverse_transform(np.column_stack((X_test[:, -1, :], predicted_time_until_next_large_event)))
Epoch 1/10
1/1 [==============================] - 2s 2s/step - loss: 1.1498 - val_loss: 0.4267
Epoch 2/10
1/1 [==============================] - 0s 28ms/step - loss: 1.1486 - val_loss: 0.4338
Epoch 3/10
1/1 [==============================] - 0s 27ms/step - loss: 1.1475 - val_loss: 0.4417
Epoch 4/10
1/1 [==============================] - 0s 28ms/step - loss: 1.1465 - val_loss: 0.4504
Epoch 5/10
1/1 [==============================] - 0s 28ms/step - loss: 1.1456 - val_loss: 0.4601
Epoch 6/10
1/1 [==============================] - 0s 29ms/step - loss: 1.1449 - val_loss: 0.4709
Epoch 7/10
1/1 [==============================] - 0s 29ms/step - loss: 1.1444 - val_loss: 0.4825
Epoch 8/10
1/1 [==============================] - 0s 33ms/step - loss: 1.1442 - val_loss: 0.4943
Epoch 9/10
1/1 [==============================] - 0s 25ms/step - loss: 1.1443 - val_loss: 0.5038
Epoch 10/10
1/1 [==============================] - 0s 29ms/step - loss: 1.1447 - val_loss: 0.5089
1/1 [==============================] - 0s 319ms/step
print(predicted_time_until_next_large_event)  # rows: [Large Event, predicted days until next large event], original scale
[[  1.        530.0596171]
 [  1.        530.0596171]
 [  1.        530.0596171]
 [  1.        530.0596171]
 [  1.        530.0596171]
 [  1.        530.0596171]
 [  1.        530.0596171]]
# Extract the relevant columns and scale the data
# Keep only confirmed large-event days with a defined inter-event gap
# (the first large event has no predecessor, so its time_diff is NaN).
large_events_usgs =  large_mag_usgs[(large_mag_usgs['Large Event'] == 1) & ~large_mag_usgs['time_diff'].isna()]
data = large_events_usgs[['Large Event', 'time_diff']]
# NOTE(review): 'Large Event' is constant (== 1) on these filtered rows, so
# after standardization that column is all zeros (see the printed output
# below) and carries no signal -- consider dropping it.
scaler = StandardScaler()
data = scaler.fit_transform(data)
print(data)
[[ 0.          0.65769802]
 [ 0.          0.36480705]
 [ 0.          0.20122433]
 [ 0.         -0.94541263]
 [ 0.         -0.76313474]
 [ 0.         -0.85193679]
 [ 0.         -0.89400092]
 [ 0.         -0.9080223 ]
 [ 0.         -0.95164435]
 [ 0.          1.07989723]
 [ 0.         -0.60889961]
 [ 0.         -0.95320228]
 [ 0.         -0.93606505]
 [ 0.          0.69041456]
 [ 0.         -0.39857897]
 [ 0.         -0.62447892]
 [ 0.         -0.0137701 ]
 [ 0.         -0.19137419]
 [ 0.          0.12488573]
 [ 0.          0.4738622 ]
 [ 0.         -0.85349472]
 [ 0.         -0.45310654]
 [ 0.         -0.57462513]
 [ 0.         -0.58553065]
 [ 0.          1.94143288]
 [ 0.          1.4241999 ]
 [ 0.          2.61913272]
 [ 0.          1.5410447 ]
 [ 0.          1.80900878]
 [ 0.         -0.95320228]
 [ 0.         -0.46712792]]
# Create sequences
sequences = create_sequences(data, sequence_length)

# Split the data into training and testing sets
X = sequences[:, :, :-1]  # Input features (all columns except the last one)
y = sequences[:, -1, -1]  # Target variable (time_diff for the last time step)
# NOTE(review): as with the ETAS run, X reduces to the constant scaled
# 'Large Event' column, so the network outputs one value for every test
# sequence (see the printed predictions further down).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(64, input_shape=(X.shape[1], X.shape[2])),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Make predictions
predicted_time_until_next_large_event = model.predict(X_test)

# Inverse transform the predictions to get them in the original scale
# (stack the last-step features with the prediction so the column layout
# matches what the scaler was fitted on).
predicted_time_until_next_large_event = scaler.inverse_transform(np.column_stack((X_test[:, -1, :], predicted_time_until_next_large_event)))
Epoch 1/10
1/1 [==============================] - 2s 2s/step - loss: 0.8276 - val_loss: 2.4585
Epoch 2/10
1/1 [==============================] - 0s 29ms/step - loss: 0.8274 - val_loss: 2.4760
Epoch 3/10
1/1 [==============================] - 0s 32ms/step - loss: 0.8273 - val_loss: 2.4897
Epoch 4/10
1/1 [==============================] - 0s 46ms/step - loss: 0.8274 - val_loss: 2.4950
Epoch 5/10
1/1 [==============================] - 0s 40ms/step - loss: 0.8274 - val_loss: 2.4937
Epoch 6/10
1/1 [==============================] - 0s 41ms/step - loss: 0.8274 - val_loss: 2.4886
Epoch 7/10
1/1 [==============================] - 0s 28ms/step - loss: 0.8273 - val_loss: 2.4821
Epoch 8/10
1/1 [==============================] - 0s 30ms/step - loss: 0.8273 - val_loss: 2.4756
Epoch 9/10
1/1 [==============================] - 0s 33ms/step - loss: 0.8273 - val_loss: 2.4700
Epoch 10/10
1/1 [==============================] - 0s 28ms/step - loss: 0.8273 - val_loss: 2.4660
1/1 [==============================] - 0s 284ms/step
print(predicted_time_until_next_large_event)  # rows: [Large Event, predicted days until next large event], original scale
[[  1.         605.43713783]
 [  1.         605.43713783]
 [  1.         605.43713783]
 [  1.         605.43713783]
 [  1.         605.43713783]]